{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# <span style=color:blue>How to read data from different text based (and non-text based) sources</span>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read data from a CSV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1 = pd.read_csv(\"CSV_EX_1.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Bedroom</th>\n",
       "      <th>Sq. foot</th>\n",
       "      <th>Locality</th>\n",
       "      <th>Price ($)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>1500</td>\n",
       "      <td>Good</td>\n",
       "      <td>300000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>1300</td>\n",
       "      <td>Fair</td>\n",
       "      <td>240000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1900</td>\n",
       "      <td>Very good</td>\n",
       "      <td>450000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>1850</td>\n",
       "      <td>Bad</td>\n",
       "      <td>280000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>1640</td>\n",
       "      <td>Good</td>\n",
       "      <td>310000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Bedroom   Sq. foot    Locality   Price ($)\n",
       "0        2       1500        Good      300000\n",
       "1        3       1300        Fair      240000\n",
       "2        3       1900   Very good      450000\n",
       "3        3       1850         Bad      280000\n",
       "4        2       1640        Good      310000"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read data from a CSV where headers are missing "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>2</th>\n",
       "      <th>1500</th>\n",
       "      <th>Good</th>\n",
       "      <th>300000</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3</td>\n",
       "      <td>1300</td>\n",
       "      <td>Fair</td>\n",
       "      <td>240000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>1900</td>\n",
       "      <td>Very good</td>\n",
       "      <td>450000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1850</td>\n",
       "      <td>Bad</td>\n",
       "      <td>280000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>1640</td>\n",
       "      <td>Good</td>\n",
       "      <td>310000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   2   1500        Good   300000\n",
       "0  3   1300        Fair   240000\n",
       "1  3   1900   Very good   450000\n",
       "2  3   1850         Bad   280000\n",
       "3  2   1640        Good   310000"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df2 = pd.read_csv(\"CSV_EX_2.csv\")\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>1500</td>\n",
       "      <td>Good</td>\n",
       "      <td>300000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>1300</td>\n",
       "      <td>Fair</td>\n",
       "      <td>240000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1900</td>\n",
       "      <td>Very good</td>\n",
       "      <td>450000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>1850</td>\n",
       "      <td>Bad</td>\n",
       "      <td>280000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>1640</td>\n",
       "      <td>Good</td>\n",
       "      <td>310000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   0     1           2       3\n",
       "0  2  1500        Good  300000\n",
       "1  3  1300        Fair  240000\n",
       "2  3  1900   Very good  450000\n",
       "3  3  1850         Bad  280000\n",
       "4  2  1640        Good  310000"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df2 = pd.read_csv(\"CSV_EX_2.csv\",header=None)\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Bedroom</th>\n",
       "      <th>Sq.ft</th>\n",
       "      <th>Locality</th>\n",
       "      <th>Price($)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>1500</td>\n",
       "      <td>Good</td>\n",
       "      <td>300000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>1300</td>\n",
       "      <td>Fair</td>\n",
       "      <td>240000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1900</td>\n",
       "      <td>Very good</td>\n",
       "      <td>450000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>1850</td>\n",
       "      <td>Bad</td>\n",
       "      <td>280000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>1640</td>\n",
       "      <td>Good</td>\n",
       "      <td>310000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Bedroom  Sq.ft    Locality  Price($)\n",
       "0        2   1500        Good    300000\n",
       "1        3   1300        Fair    240000\n",
       "2        3   1900   Very good    450000\n",
       "3        3   1850         Bad    280000\n",
       "4        2   1640        Good    310000"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df2 = pd.read_csv(\"CSV_EX_2.csv\",header=None, names=['Bedroom','Sq.ft','Locality','Price($)'])\n",
    "df2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Read data from a CSV where delimiters/separators are not comma "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Bedroom; Sq. foot; Locality; Price ($)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2; 1500; Good; 300000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3; 1300; Fair; 240000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3; 1900; Very good; 450000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3; 1850; Bad; 280000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2; 1640; Good; 310000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Bedroom; Sq. foot; Locality; Price ($)\n",
       "0                  2; 1500; Good; 300000\n",
       "1                  3; 1300; Fair; 240000\n",
       "2             3; 1900; Very good; 450000\n",
       "3                   3; 1850; Bad; 280000\n",
       "4                  2; 1640; Good; 310000"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df3 = pd.read_csv(\"CSV_EX_3.csv\")\n",
    "df3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Bedroom</th>\n",
       "      <th>Sq. foot</th>\n",
       "      <th>Locality</th>\n",
       "      <th>Price ($)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>1500</td>\n",
       "      <td>Good</td>\n",
       "      <td>300000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>1300</td>\n",
       "      <td>Fair</td>\n",
       "      <td>240000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1900</td>\n",
       "      <td>Very good</td>\n",
       "      <td>450000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>1850</td>\n",
       "      <td>Bad</td>\n",
       "      <td>280000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>1640</td>\n",
       "      <td>Good</td>\n",
       "      <td>310000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Bedroom   Sq. foot    Locality   Price ($)\n",
       "0        2       1500        Good      300000\n",
       "1        3       1300        Fair      240000\n",
       "2        3       1900   Very good      450000\n",
       "3        3       1850         Bad      280000\n",
       "4        2       1640        Good      310000"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df3 = pd.read_csv(\"CSV_EX_3.csv\",sep=';')\n",
    "df3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How to bypass given headers with your own? "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>D</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Bedroom</td>\n",
       "      <td>Sq. foot</td>\n",
       "      <td>Locality</td>\n",
       "      <td>Price ($)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1500</td>\n",
       "      <td>Good</td>\n",
       "      <td>300000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1300</td>\n",
       "      <td>Fair</td>\n",
       "      <td>240000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>1900</td>\n",
       "      <td>Very good</td>\n",
       "      <td>450000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3</td>\n",
       "      <td>1850</td>\n",
       "      <td>Bad</td>\n",
       "      <td>280000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2</td>\n",
       "      <td>1640</td>\n",
       "      <td>Good</td>\n",
       "      <td>310000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         A          B           C           D\n",
       "0  Bedroom   Sq. foot    Locality   Price ($)\n",
       "1        2       1500        Good      300000\n",
       "2        3       1300        Fair      240000\n",
       "3        3       1900   Very good      450000\n",
       "4        3       1850         Bad      280000\n",
       "5        2       1640        Good      310000"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df4 = pd.read_csv(\"CSV_EX_1.csv\",names=['A','B','C','D'])\n",
    "df4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>D</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>1500</td>\n",
       "      <td>Good</td>\n",
       "      <td>300000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>1300</td>\n",
       "      <td>Fair</td>\n",
       "      <td>240000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1900</td>\n",
       "      <td>Very good</td>\n",
       "      <td>450000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>1850</td>\n",
       "      <td>Bad</td>\n",
       "      <td>280000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>1640</td>\n",
       "      <td>Good</td>\n",
       "      <td>310000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   A     B           C       D\n",
       "0  2  1500        Good  300000\n",
       "1  3  1300        Fair  240000\n",
       "2  3  1900   Very good  450000\n",
       "3  3  1850         Bad  280000\n",
       "4  2  1640        Good  310000"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df4 = pd.read_csv(\"CSV_EX_1.csv\",header=0,names=['A','B','C','D'])\n",
    "df4"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Skip initial rows "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Filetype: CSV</th>\n",
       "      <th>Unnamed: 1</th>\n",
       "      <th>Unnamed: 2</th>\n",
       "      <th>Unnamed: 3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>NaN</td>\n",
       "      <td>Info about some houses</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Bedroom</td>\n",
       "      <td>Sq. foot</td>\n",
       "      <td>Locality</td>\n",
       "      <td>Price ($)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>1500</td>\n",
       "      <td>Good</td>\n",
       "      <td>300000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>1300</td>\n",
       "      <td>Fair</td>\n",
       "      <td>240000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3</td>\n",
       "      <td>1900</td>\n",
       "      <td>Very good</td>\n",
       "      <td>450000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>3</td>\n",
       "      <td>1850</td>\n",
       "      <td>Bad</td>\n",
       "      <td>280000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>2</td>\n",
       "      <td>1640</td>\n",
       "      <td>Good</td>\n",
       "      <td>310000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Filetype: CSV              Unnamed: 1  Unnamed: 2  Unnamed: 3\n",
       "0           NaN  Info about some houses         NaN         NaN\n",
       "1       Bedroom                Sq. foot    Locality   Price ($)\n",
       "2             2                    1500        Good      300000\n",
       "3             3                    1300        Fair      240000\n",
       "4             3                    1900   Very good      450000\n",
       "5             3                    1850         Bad      280000\n",
       "6             2                    1640        Good      310000"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df5 = pd.read_csv(\"CSV_EX_skiprows.csv\")\n",
    "df5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Bedroom</th>\n",
       "      <th>Sq. foot</th>\n",
       "      <th>Locality</th>\n",
       "      <th>Price ($)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>1500</td>\n",
       "      <td>Good</td>\n",
       "      <td>300000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>1300</td>\n",
       "      <td>Fair</td>\n",
       "      <td>240000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1900</td>\n",
       "      <td>Very good</td>\n",
       "      <td>450000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>1850</td>\n",
       "      <td>Bad</td>\n",
       "      <td>280000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>1640</td>\n",
       "      <td>Good</td>\n",
       "      <td>310000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Bedroom   Sq. foot    Locality   Price ($)\n",
       "0        2       1500        Good      300000\n",
       "1        3       1300        Fair      240000\n",
       "2        3       1900   Very good      450000\n",
       "3        3       1850         Bad      280000\n",
       "4        2       1640        Good      310000"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df5 = pd.read_csv(\"CSV_EX_skiprows.csv\",skiprows=2)\n",
    "df5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Skip footers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Filetype: CSV</th>\n",
       "      <th>Unnamed: 1</th>\n",
       "      <th>Unnamed: 2</th>\n",
       "      <th>Unnamed: 3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>NaN</td>\n",
       "      <td>Info about some houses</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Bedroom</td>\n",
       "      <td>Sq. foot</td>\n",
       "      <td>Locality</td>\n",
       "      <td>Price ($)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>1500</td>\n",
       "      <td>Good</td>\n",
       "      <td>300000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>1300</td>\n",
       "      <td>Fair</td>\n",
       "      <td>240000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3</td>\n",
       "      <td>1900</td>\n",
       "      <td>Very good</td>\n",
       "      <td>450000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>3</td>\n",
       "      <td>1850</td>\n",
       "      <td>Bad</td>\n",
       "      <td>280000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>2</td>\n",
       "      <td>1640</td>\n",
       "      <td>Good</td>\n",
       "      <td>310000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>NaN</td>\n",
       "      <td>This is the end of file</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Filetype: CSV                Unnamed: 1  Unnamed: 2  Unnamed: 3\n",
       "0           NaN    Info about some houses         NaN         NaN\n",
       "1       Bedroom                  Sq. foot    Locality   Price ($)\n",
       "2             2                      1500        Good      300000\n",
       "3             3                      1300        Fair      240000\n",
       "4             3                      1900   Very good      450000\n",
       "5             3                      1850         Bad      280000\n",
       "6             2                      1640        Good      310000\n",
       "7           NaN   This is the end of file         NaN         NaN"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df6 = pd.read_csv(\"CSV_EX_skipfooter.csv\")\n",
    "df6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Bedroom</th>\n",
       "      <th>Sq. foot</th>\n",
       "      <th>Locality</th>\n",
       "      <th>Price ($)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>1500</td>\n",
       "      <td>Good</td>\n",
       "      <td>300000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>1300</td>\n",
       "      <td>Fair</td>\n",
       "      <td>240000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1900</td>\n",
       "      <td>Very good</td>\n",
       "      <td>450000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>1850</td>\n",
       "      <td>Bad</td>\n",
       "      <td>280000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>1640</td>\n",
       "      <td>Good</td>\n",
       "      <td>310000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Bedroom   Sq. foot    Locality   Price ($)\n",
       "0        2       1500        Good      300000\n",
       "1        3       1300        Fair      240000\n",
       "2        3       1900   Very good      450000\n",
       "3        3       1850         Bad      280000\n",
       "4        2       1640        Good      310000"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df6 = pd.read_csv(\"CSV_EX_skipfooter.csv\",skiprows=2,skipfooter=1,engine='python')\n",
    "df6"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read only first *n* rows (especially useful for large files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Bedroom</th>\n",
       "      <th>Sq. foot</th>\n",
       "      <th>Locality</th>\n",
       "      <th>Price ($)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>1500</td>\n",
       "      <td>Good</td>\n",
       "      <td>300000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>1300</td>\n",
       "      <td>Fair</td>\n",
       "      <td>240000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Bedroom   Sq. foot  Locality   Price ($)\n",
       "0        2       1500      Good      300000\n",
       "1        3       1300      Fair      240000"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df7 = pd.read_csv(\"CSV_EX_1.csv\",nrows=2)\n",
    "df7"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How to combine `skiprows` and `nrows` to read data in small chunks  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# List where DataFrames will be stored\n",
    "list_of_dataframe = []\n",
    "# Number of rows to be read in one chunk\n",
    "rows_in_a_chunk = 10\n",
    "# Number of chunks to be read (this many separate DataFrames will be produced)\n",
    "num_chunks = 5\n",
    "# Dummy DataFrame to get the column names\n",
    "df_dummy = pd.read_csv(\"Boston_housing.csv\",nrows=2)\n",
    "colnames = df_dummy.columns\n",
    "# Loop over the CSV file to read only specified number of rows at a time\n",
    "# Note how the iterator variable i is set up inside the range\n",
    "for i in range(0,num_chunks*rows_in_a_chunk,rows_in_a_chunk):\n",
    "    df = pd.read_csv(\"Boston_housing.csv\",header=0,skiprows=i,nrows=rows_in_a_chunk,names=colnames)\n",
    "    list_of_dataframe.append(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>CRIM</th>\n",
       "      <th>ZN</th>\n",
       "      <th>INDUS</th>\n",
       "      <th>CHAS</th>\n",
       "      <th>NOX</th>\n",
       "      <th>RM</th>\n",
       "      <th>AGE</th>\n",
       "      <th>DIS</th>\n",
       "      <th>RAD</th>\n",
       "      <th>TAX</th>\n",
       "      <th>PTRATIO</th>\n",
       "      <th>B</th>\n",
       "      <th>LSTAT</th>\n",
       "      <th>PRICE</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.00632</td>\n",
       "      <td>18.0</td>\n",
       "      <td>2.31</td>\n",
       "      <td>0</td>\n",
       "      <td>0.538</td>\n",
       "      <td>6.575</td>\n",
       "      <td>65.2</td>\n",
       "      <td>4.0900</td>\n",
       "      <td>1</td>\n",
       "      <td>296</td>\n",
       "      <td>15.3</td>\n",
       "      <td>396.90</td>\n",
       "      <td>4.98</td>\n",
       "      <td>24.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.02731</td>\n",
       "      <td>0.0</td>\n",
       "      <td>7.07</td>\n",
       "      <td>0</td>\n",
       "      <td>0.469</td>\n",
       "      <td>6.421</td>\n",
       "      <td>78.9</td>\n",
       "      <td>4.9671</td>\n",
       "      <td>2</td>\n",
       "      <td>242</td>\n",
       "      <td>17.8</td>\n",
       "      <td>396.90</td>\n",
       "      <td>9.14</td>\n",
       "      <td>21.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.02729</td>\n",
       "      <td>0.0</td>\n",
       "      <td>7.07</td>\n",
       "      <td>0</td>\n",
       "      <td>0.469</td>\n",
       "      <td>7.185</td>\n",
       "      <td>61.1</td>\n",
       "      <td>4.9671</td>\n",
       "      <td>2</td>\n",
       "      <td>242</td>\n",
       "      <td>17.8</td>\n",
       "      <td>392.83</td>\n",
       "      <td>4.03</td>\n",
       "      <td>34.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.03237</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.18</td>\n",
       "      <td>0</td>\n",
       "      <td>0.458</td>\n",
       "      <td>6.998</td>\n",
       "      <td>45.8</td>\n",
       "      <td>6.0622</td>\n",
       "      <td>3</td>\n",
       "      <td>222</td>\n",
       "      <td>18.7</td>\n",
       "      <td>394.63</td>\n",
       "      <td>2.94</td>\n",
       "      <td>33.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.06905</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.18</td>\n",
       "      <td>0</td>\n",
       "      <td>0.458</td>\n",
       "      <td>7.147</td>\n",
       "      <td>54.2</td>\n",
       "      <td>6.0622</td>\n",
       "      <td>3</td>\n",
       "      <td>222</td>\n",
       "      <td>18.7</td>\n",
       "      <td>396.90</td>\n",
       "      <td>5.33</td>\n",
       "      <td>36.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.02985</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.18</td>\n",
       "      <td>0</td>\n",
       "      <td>0.458</td>\n",
       "      <td>6.430</td>\n",
       "      <td>58.7</td>\n",
       "      <td>6.0622</td>\n",
       "      <td>3</td>\n",
       "      <td>222</td>\n",
       "      <td>18.7</td>\n",
       "      <td>394.12</td>\n",
       "      <td>5.21</td>\n",
       "      <td>28.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0.08829</td>\n",
       "      <td>12.5</td>\n",
       "      <td>7.87</td>\n",
       "      <td>0</td>\n",
       "      <td>0.524</td>\n",
       "      <td>6.012</td>\n",
       "      <td>66.6</td>\n",
       "      <td>5.5605</td>\n",
       "      <td>5</td>\n",
       "      <td>311</td>\n",
       "      <td>15.2</td>\n",
       "      <td>395.60</td>\n",
       "      <td>12.43</td>\n",
       "      <td>22.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>0.14455</td>\n",
       "      <td>12.5</td>\n",
       "      <td>7.87</td>\n",
       "      <td>0</td>\n",
       "      <td>0.524</td>\n",
       "      <td>6.172</td>\n",
       "      <td>96.1</td>\n",
       "      <td>5.9505</td>\n",
       "      <td>5</td>\n",
       "      <td>311</td>\n",
       "      <td>15.2</td>\n",
       "      <td>396.90</td>\n",
       "      <td>19.15</td>\n",
       "      <td>27.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>0.21124</td>\n",
       "      <td>12.5</td>\n",
       "      <td>7.87</td>\n",
       "      <td>0</td>\n",
       "      <td>0.524</td>\n",
       "      <td>5.631</td>\n",
       "      <td>100.0</td>\n",
       "      <td>6.0821</td>\n",
       "      <td>5</td>\n",
       "      <td>311</td>\n",
       "      <td>15.2</td>\n",
       "      <td>386.63</td>\n",
       "      <td>29.93</td>\n",
       "      <td>16.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>0.17004</td>\n",
       "      <td>12.5</td>\n",
       "      <td>7.87</td>\n",
       "      <td>0</td>\n",
       "      <td>0.524</td>\n",
       "      <td>6.004</td>\n",
       "      <td>85.9</td>\n",
       "      <td>6.5921</td>\n",
       "      <td>5</td>\n",
       "      <td>311</td>\n",
       "      <td>15.2</td>\n",
       "      <td>386.71</td>\n",
       "      <td>17.10</td>\n",
       "      <td>18.9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      CRIM    ZN  INDUS  CHAS    NOX     RM    AGE     DIS  RAD  TAX  PTRATIO  \\\n",
       "0  0.00632  18.0   2.31     0  0.538  6.575   65.2  4.0900    1  296     15.3   \n",
       "1  0.02731   0.0   7.07     0  0.469  6.421   78.9  4.9671    2  242     17.8   \n",
       "2  0.02729   0.0   7.07     0  0.469  7.185   61.1  4.9671    2  242     17.8   \n",
       "3  0.03237   0.0   2.18     0  0.458  6.998   45.8  6.0622    3  222     18.7   \n",
       "4  0.06905   0.0   2.18     0  0.458  7.147   54.2  6.0622    3  222     18.7   \n",
       "5  0.02985   0.0   2.18     0  0.458  6.430   58.7  6.0622    3  222     18.7   \n",
       "6  0.08829  12.5   7.87     0  0.524  6.012   66.6  5.5605    5  311     15.2   \n",
       "7  0.14455  12.5   7.87     0  0.524  6.172   96.1  5.9505    5  311     15.2   \n",
       "8  0.21124  12.5   7.87     0  0.524  5.631  100.0  6.0821    5  311     15.2   \n",
       "9  0.17004  12.5   7.87     0  0.524  6.004   85.9  6.5921    5  311     15.2   \n",
       "\n",
       "        B  LSTAT  PRICE  \n",
       "0  396.90   4.98   24.0  \n",
       "1  396.90   9.14   21.6  \n",
       "2  392.83   4.03   34.7  \n",
       "3  394.63   2.94   33.4  \n",
       "4  396.90   5.33   36.2  \n",
       "5  394.12   5.21   28.7  \n",
       "6  395.60  12.43   22.9  \n",
       "7  396.90  19.15   27.1  \n",
       "8  386.63  29.93   16.5  \n",
       "9  386.71  17.10   18.9  "
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list_of_dataframe[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>CRIM</th>\n",
       "      <th>ZN</th>\n",
       "      <th>INDUS</th>\n",
       "      <th>CHAS</th>\n",
       "      <th>NOX</th>\n",
       "      <th>RM</th>\n",
       "      <th>AGE</th>\n",
       "      <th>DIS</th>\n",
       "      <th>RAD</th>\n",
       "      <th>TAX</th>\n",
       "      <th>PTRATIO</th>\n",
       "      <th>B</th>\n",
       "      <th>LSTAT</th>\n",
       "      <th>PRICE</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.22489</td>\n",
       "      <td>12.5</td>\n",
       "      <td>7.87</td>\n",
       "      <td>0</td>\n",
       "      <td>0.524</td>\n",
       "      <td>6.377</td>\n",
       "      <td>94.3</td>\n",
       "      <td>6.3467</td>\n",
       "      <td>5</td>\n",
       "      <td>311</td>\n",
       "      <td>15.2</td>\n",
       "      <td>392.52</td>\n",
       "      <td>20.45</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.11747</td>\n",
       "      <td>12.5</td>\n",
       "      <td>7.87</td>\n",
       "      <td>0</td>\n",
       "      <td>0.524</td>\n",
       "      <td>6.009</td>\n",
       "      <td>82.9</td>\n",
       "      <td>6.2267</td>\n",
       "      <td>5</td>\n",
       "      <td>311</td>\n",
       "      <td>15.2</td>\n",
       "      <td>396.90</td>\n",
       "      <td>13.27</td>\n",
       "      <td>18.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.09378</td>\n",
       "      <td>12.5</td>\n",
       "      <td>7.87</td>\n",
       "      <td>0</td>\n",
       "      <td>0.524</td>\n",
       "      <td>5.889</td>\n",
       "      <td>39.0</td>\n",
       "      <td>5.4509</td>\n",
       "      <td>5</td>\n",
       "      <td>311</td>\n",
       "      <td>15.2</td>\n",
       "      <td>390.50</td>\n",
       "      <td>15.71</td>\n",
       "      <td>21.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.62976</td>\n",
       "      <td>0.0</td>\n",
       "      <td>8.14</td>\n",
       "      <td>0</td>\n",
       "      <td>0.538</td>\n",
       "      <td>5.949</td>\n",
       "      <td>61.8</td>\n",
       "      <td>4.7075</td>\n",
       "      <td>4</td>\n",
       "      <td>307</td>\n",
       "      <td>21.0</td>\n",
       "      <td>396.90</td>\n",
       "      <td>8.26</td>\n",
       "      <td>20.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.63796</td>\n",
       "      <td>0.0</td>\n",
       "      <td>8.14</td>\n",
       "      <td>0</td>\n",
       "      <td>0.538</td>\n",
       "      <td>6.096</td>\n",
       "      <td>84.5</td>\n",
       "      <td>4.4619</td>\n",
       "      <td>4</td>\n",
       "      <td>307</td>\n",
       "      <td>21.0</td>\n",
       "      <td>380.02</td>\n",
       "      <td>10.26</td>\n",
       "      <td>18.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.62739</td>\n",
       "      <td>0.0</td>\n",
       "      <td>8.14</td>\n",
       "      <td>0</td>\n",
       "      <td>0.538</td>\n",
       "      <td>5.834</td>\n",
       "      <td>56.5</td>\n",
       "      <td>4.4986</td>\n",
       "      <td>4</td>\n",
       "      <td>307</td>\n",
       "      <td>21.0</td>\n",
       "      <td>395.62</td>\n",
       "      <td>8.47</td>\n",
       "      <td>19.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1.05393</td>\n",
       "      <td>0.0</td>\n",
       "      <td>8.14</td>\n",
       "      <td>0</td>\n",
       "      <td>0.538</td>\n",
       "      <td>5.935</td>\n",
       "      <td>29.3</td>\n",
       "      <td>4.4986</td>\n",
       "      <td>4</td>\n",
       "      <td>307</td>\n",
       "      <td>21.0</td>\n",
       "      <td>386.85</td>\n",
       "      <td>6.58</td>\n",
       "      <td>23.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>0.78420</td>\n",
       "      <td>0.0</td>\n",
       "      <td>8.14</td>\n",
       "      <td>0</td>\n",
       "      <td>0.538</td>\n",
       "      <td>5.990</td>\n",
       "      <td>81.7</td>\n",
       "      <td>4.2579</td>\n",
       "      <td>4</td>\n",
       "      <td>307</td>\n",
       "      <td>21.0</td>\n",
       "      <td>386.75</td>\n",
       "      <td>14.67</td>\n",
       "      <td>17.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>0.80271</td>\n",
       "      <td>0.0</td>\n",
       "      <td>8.14</td>\n",
       "      <td>0</td>\n",
       "      <td>0.538</td>\n",
       "      <td>5.456</td>\n",
       "      <td>36.6</td>\n",
       "      <td>3.7965</td>\n",
       "      <td>4</td>\n",
       "      <td>307</td>\n",
       "      <td>21.0</td>\n",
       "      <td>288.99</td>\n",
       "      <td>11.69</td>\n",
       "      <td>20.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>0.72580</td>\n",
       "      <td>0.0</td>\n",
       "      <td>8.14</td>\n",
       "      <td>0</td>\n",
       "      <td>0.538</td>\n",
       "      <td>5.727</td>\n",
       "      <td>69.5</td>\n",
       "      <td>3.7965</td>\n",
       "      <td>4</td>\n",
       "      <td>307</td>\n",
       "      <td>21.0</td>\n",
       "      <td>390.95</td>\n",
       "      <td>11.28</td>\n",
       "      <td>18.2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD  TAX  PTRATIO  \\\n",
       "0  0.22489  12.5   7.87     0  0.524  6.377  94.3  6.3467    5  311     15.2   \n",
       "1  0.11747  12.5   7.87     0  0.524  6.009  82.9  6.2267    5  311     15.2   \n",
       "2  0.09378  12.5   7.87     0  0.524  5.889  39.0  5.4509    5  311     15.2   \n",
       "3  0.62976   0.0   8.14     0  0.538  5.949  61.8  4.7075    4  307     21.0   \n",
       "4  0.63796   0.0   8.14     0  0.538  6.096  84.5  4.4619    4  307     21.0   \n",
       "5  0.62739   0.0   8.14     0  0.538  5.834  56.5  4.4986    4  307     21.0   \n",
       "6  1.05393   0.0   8.14     0  0.538  5.935  29.3  4.4986    4  307     21.0   \n",
       "7  0.78420   0.0   8.14     0  0.538  5.990  81.7  4.2579    4  307     21.0   \n",
       "8  0.80271   0.0   8.14     0  0.538  5.456  36.6  3.7965    4  307     21.0   \n",
       "9  0.72580   0.0   8.14     0  0.538  5.727  69.5  3.7965    4  307     21.0   \n",
       "\n",
       "        B  LSTAT  PRICE  \n",
       "0  392.52  20.45   15.0  \n",
       "1  396.90  13.27   18.9  \n",
       "2  390.50  15.71   21.7  \n",
       "3  396.90   8.26   20.4  \n",
       "4  380.02  10.26   18.2  \n",
       "5  395.62   8.47   19.9  \n",
       "6  386.85   6.58   23.1  \n",
       "7  386.75  14.67   17.5  \n",
       "8  288.99  11.69   20.2  \n",
       "9  390.95  11.28   18.2  "
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list_of_dataframe[1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Setting the option `skip_blank_lines`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Bedroom</th>\n",
       "      <th>Sq. foot</th>\n",
       "      <th>Locality</th>\n",
       "      <th>Price ($)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>1500</td>\n",
       "      <td>Good</td>\n",
       "      <td>300000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>1300</td>\n",
       "      <td>Fair</td>\n",
       "      <td>240000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1900</td>\n",
       "      <td>Very good</td>\n",
       "      <td>450000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>1850</td>\n",
       "      <td>Bad</td>\n",
       "      <td>280000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>1640</td>\n",
       "      <td>Good</td>\n",
       "      <td>310000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Bedroom   Sq. foot    Locality   Price ($)\n",
       "0        2       1500        Good      300000\n",
       "1        3       1300        Fair      240000\n",
       "2        3       1900   Very good      450000\n",
       "3        3       1850         Bad      280000\n",
       "4        2       1640        Good      310000"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df9 = pd.read_csv(\"CSV_EX_blankline.csv\")\n",
    "df9"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Bedroom</th>\n",
       "      <th>Sq. foot</th>\n",
       "      <th>Locality</th>\n",
       "      <th>Price ($)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>Good</td>\n",
       "      <td>300000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1300.0</td>\n",
       "      <td>Fair</td>\n",
       "      <td>240000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1900.0</td>\n",
       "      <td>Very good</td>\n",
       "      <td>450000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1850.0</td>\n",
       "      <td>Bad</td>\n",
       "      <td>280000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>2.0</td>\n",
       "      <td>1640.0</td>\n",
       "      <td>Good</td>\n",
       "      <td>310000.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Bedroom   Sq. foot    Locality   Price ($)\n",
       "0      2.0     1500.0        Good    300000.0\n",
       "1      3.0     1300.0        Fair    240000.0\n",
       "2      NaN        NaN         NaN         NaN\n",
       "3      3.0     1900.0   Very good    450000.0\n",
       "4      3.0     1850.0         Bad    280000.0\n",
       "5      NaN        NaN         NaN         NaN\n",
       "6      2.0     1640.0        Good    310000.0"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df9 = pd.read_csv(\"CSV_EX_blankline.csv\",skip_blank_lines=False)\n",
    "df9"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read CSV from inside a compressed (.zip/.gz/.bz2/.xz) file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Bedroom</th>\n",
       "      <th>Sq. foot</th>\n",
       "      <th>Locality</th>\n",
       "      <th>Price ($)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>1500</td>\n",
       "      <td>Good</td>\n",
       "      <td>300000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>1300</td>\n",
       "      <td>Fair</td>\n",
       "      <td>240000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1900</td>\n",
       "      <td>Very good</td>\n",
       "      <td>450000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>1850</td>\n",
       "      <td>Bad</td>\n",
       "      <td>280000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>1640</td>\n",
       "      <td>Good</td>\n",
       "      <td>310000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Bedroom   Sq. foot    Locality   Price ($)\n",
       "0        2       1500        Good      300000\n",
       "1        3       1300        Fair      240000\n",
       "2        3       1900   Very good      450000\n",
       "3        3       1850         Bad      280000\n",
       "4        2       1640        Good      310000"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df10 = pd.read_csv('CSV_EX_1.zip')\n",
    "df10"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Reading from an Excel file - how to use `sheet_name`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "df11_1 = pd.read_excel(\"Housing_data.xlsx\",sheet_name='Data_Tab_1')\n",
    "df11_2 = pd.read_excel(\"Housing_data.xlsx\",sheet_name='Data_Tab_2')\n",
    "df11_3 = pd.read_excel(\"Housing_data.xlsx\",sheet_name='Data_Tab_3')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(9, 14)"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df11_1.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4, 14)"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df11_2.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(16, 14)"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df11_3.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### If `sheet_name` is set to `None` then an Ordered Dictionary of DataFrame is returned if the Excel file has distinct sheets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "dict_df = pd.read_excel(\"Housing_data.xlsx\",sheet_name=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "odict_keys(['Data_Tab_1', 'Data_Tab_2', 'Data_Tab_3'])"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dict_df.keys()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### General delimated text file can be read same as a CSV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Bedroom, Sq. foot, Locality, Price ($)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2, 1500, Good, 300000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3, 1300, Fair, 240000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3, 1900, Very good, 450000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3, 1850, Bad, 280000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2, 1640, Good, 310000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Bedroom, Sq. foot, Locality, Price ($)\n",
       "0                  2, 1500, Good, 300000\n",
       "1                  3, 1300, Fair, 240000\n",
       "2             3, 1900, Very good, 450000\n",
       "3                   3, 1850, Bad, 280000\n",
       "4                  2, 1640, Good, 310000"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df13 = pd.read_table(\"Table_EX_1.txt\")\n",
    "df13"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Bedroom</th>\n",
       "      <th>Sq. foot</th>\n",
       "      <th>Locality</th>\n",
       "      <th>Price ($)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>1500</td>\n",
       "      <td>Good</td>\n",
       "      <td>300000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>1300</td>\n",
       "      <td>Fair</td>\n",
       "      <td>240000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1900</td>\n",
       "      <td>Very good</td>\n",
       "      <td>450000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>1850</td>\n",
       "      <td>Bad</td>\n",
       "      <td>280000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>1640</td>\n",
       "      <td>Good</td>\n",
       "      <td>310000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Bedroom   Sq. foot    Locality   Price ($)\n",
       "0        2       1500        Good      300000\n",
       "1        3       1300        Fair      240000\n",
       "2        3       1900   Very good      450000\n",
       "3        3       1850         Bad      280000\n",
       "4        2       1640        Good      310000"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df13 = pd.read_table(\"Table_EX_1.txt\",sep=',')\n",
    "df13"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Bedroom</th>\n",
       "      <th>Sq. foot</th>\n",
       "      <th>Locality</th>\n",
       "      <th>Price ($)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>1500</td>\n",
       "      <td>Good</td>\n",
       "      <td>300000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>1300</td>\n",
       "      <td>Fair</td>\n",
       "      <td>240000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1900</td>\n",
       "      <td>Very good</td>\n",
       "      <td>450000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>1850</td>\n",
       "      <td>Bad</td>\n",
       "      <td>280000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>1640</td>\n",
       "      <td>Good</td>\n",
       "      <td>310000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Bedroom  Sq. foot   Locality  Price ($)\n",
       "0        2      1500       Good     300000\n",
       "1        3      1300       Fair     240000\n",
       "2        3      1900  Very good     450000\n",
       "3        3      1850        Bad     280000\n",
       "4        2      1640       Good     310000"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df13 = pd.read_table(\"Table_tab_separated.txt\",)\n",
    "df13"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read HTML tables directly from an URL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "url = 'http://www.fdic.gov/bank/individual/failed/banklist.html'\n",
    "list_of_df = pd.read_html(url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Bank Name</th>\n",
       "      <th>City</th>\n",
       "      <th>ST</th>\n",
       "      <th>CERT</th>\n",
       "      <th>Acquiring Institution</th>\n",
       "      <th>Closing Date</th>\n",
       "      <th>Updated Date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Washington Federal Bank for Savings</td>\n",
       "      <td>Chicago</td>\n",
       "      <td>IL</td>\n",
       "      <td>30570</td>\n",
       "      <td>Royal Savings Bank</td>\n",
       "      <td>December 15, 2017</td>\n",
       "      <td>February 21, 2018</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>The Farmers and Merchants State Bank of Argonia</td>\n",
       "      <td>Argonia</td>\n",
       "      <td>KS</td>\n",
       "      <td>17719</td>\n",
       "      <td>Conway Bank</td>\n",
       "      <td>October 13, 2017</td>\n",
       "      <td>February 21, 2018</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Fayette County Bank</td>\n",
       "      <td>Saint Elmo</td>\n",
       "      <td>IL</td>\n",
       "      <td>1802</td>\n",
       "      <td>United Fidelity Bank, fsb</td>\n",
       "      <td>May 26, 2017</td>\n",
       "      <td>July 26, 2017</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Guaranty Bank, (d/b/a BestBank in Georgia &amp; Mi...</td>\n",
       "      <td>Milwaukee</td>\n",
       "      <td>WI</td>\n",
       "      <td>30003</td>\n",
       "      <td>First-Citizens Bank &amp; Trust Company</td>\n",
       "      <td>May 5, 2017</td>\n",
       "      <td>March 22, 2018</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>First NBC Bank</td>\n",
       "      <td>New Orleans</td>\n",
       "      <td>LA</td>\n",
       "      <td>58302</td>\n",
       "      <td>Whitney Bank</td>\n",
       "      <td>April 28, 2017</td>\n",
       "      <td>December 5, 2017</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           Bank Name         City  ST   CERT  \\\n",
       "0                Washington Federal Bank for Savings      Chicago  IL  30570   \n",
       "1    The Farmers and Merchants State Bank of Argonia      Argonia  KS  17719   \n",
       "2                                Fayette County Bank   Saint Elmo  IL   1802   \n",
       "3  Guaranty Bank, (d/b/a BestBank in Georgia & Mi...    Milwaukee  WI  30003   \n",
       "4                                     First NBC Bank  New Orleans  LA  58302   \n",
       "\n",
       "                 Acquiring Institution       Closing Date       Updated Date  \n",
       "0                   Royal Savings Bank  December 15, 2017  February 21, 2018  \n",
       "1                          Conway Bank   October 13, 2017  February 21, 2018  \n",
       "2            United Fidelity Bank, fsb       May 26, 2017      July 26, 2017  \n",
       "3  First-Citizens Bank & Trust Company        May 5, 2017     March 22, 2018  \n",
       "4                         Whitney Bank     April 28, 2017   December 5, 2017  "
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df14 = list_of_df[0]\n",
    "df14.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Mostly, `read_html` returns more than one table and further wrangling is needed to get the desired data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "list_of_df = pd.read_html(\"https://en.wikipedia.org/wiki/2016_Summer_Olympics_medal_table\",header=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(list_of_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1, 1)\n",
      "(87, 6)\n",
      "(10, 8)\n",
      "(0, 2)\n",
      "(1, 1)\n",
      "(4, 2)\n"
     ]
    }
   ],
   "source": [
    "for t in list_of_df:\n",
    "    print(t.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Rank</th>\n",
       "      <th>NOC</th>\n",
       "      <th>Gold</th>\n",
       "      <th>Silver</th>\n",
       "      <th>Bronze</th>\n",
       "      <th>Total</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>United States (USA)</td>\n",
       "      <td>46</td>\n",
       "      <td>37</td>\n",
       "      <td>38</td>\n",
       "      <td>121.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Great Britain (GBR)</td>\n",
       "      <td>27</td>\n",
       "      <td>23</td>\n",
       "      <td>17</td>\n",
       "      <td>67.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>China (CHN)</td>\n",
       "      <td>26</td>\n",
       "      <td>18</td>\n",
       "      <td>26</td>\n",
       "      <td>70.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>Russia (RUS)</td>\n",
       "      <td>19</td>\n",
       "      <td>17</td>\n",
       "      <td>20</td>\n",
       "      <td>56.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>Germany (GER)</td>\n",
       "      <td>17</td>\n",
       "      <td>10</td>\n",
       "      <td>15</td>\n",
       "      <td>42.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Rank                  NOC  Gold  Silver  Bronze  Total\n",
       "0    1  United States (USA)    46      37      38  121.0\n",
       "1    2  Great Britain (GBR)    27      23      17   67.0\n",
       "2    3          China (CHN)    26      18      26   70.0\n",
       "3    4         Russia (RUS)    19      17      20   56.0\n",
       "4    5        Germany (GER)    17      10      15   42.0"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df15=list_of_df[1]\n",
    "df15.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read in a JSON file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [],
   "source": [
    "df16 = pd.read_json(\"movies.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cast</th>\n",
       "      <th>genres</th>\n",
       "      <th>title</th>\n",
       "      <th>year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>After Dark in Central Park</td>\n",
       "      <td>1900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>Boarding School Girls' Pajama Parade</td>\n",
       "      <td>1900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>Buffalo Bill's Wild West Parad</td>\n",
       "      <td>1900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>Caught</td>\n",
       "      <td>1900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>Clowns Spinning Hats</td>\n",
       "      <td>1900</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  cast genres                                 title  year\n",
       "0   []     []            After Dark in Central Park  1900\n",
       "1   []     []  Boarding School Girls' Pajama Parade  1900\n",
       "2   []     []        Buffalo Bill's Wild West Parad  1900\n",
       "3   []     []                                Caught  1900\n",
       "4   []     []                  Clowns Spinning Hats  1900"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df16.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "13519                           [Adele Mara, John Carroll]\n",
       "23778    [Ralph Fiennes, Uma Thurman, Sean Connery, Jim...\n",
       "27195    [Robert Downey, Jr., Chris Evans, Mark Ruffalo...\n",
       "Name: cast, dtype: object"
      ]
     },
     "execution_count": 93,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df16[df16['title']==\"The Avengers\"]['cast']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [],
   "source": [
    "cast_of_avengers=df16[(df16['title']==\"The Avengers\") & (df16['year']==2012)]['cast']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['Robert Downey, Jr.', 'Chris Evans', 'Mark Ruffalo', 'Chris Hemsworth', 'Scarlett Johansson', 'Jeremy Renner', 'Tom Hiddleston', 'Clark Gregg', 'Cobie Smulders', 'Stellan SkarsgÃ¥rd', 'Samuel L. Jackson']]\n"
     ]
    }
   ],
   "source": [
    "print(list(cast_of_avengers))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read Stata file (.dta)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "df17 = pd.read_stata(\"rscfp2016.dta\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>YY1</th>\n",
       "      <th>Y1</th>\n",
       "      <th>wgt</th>\n",
       "      <th>hhsex</th>\n",
       "      <th>age</th>\n",
       "      <th>agecl</th>\n",
       "      <th>educ</th>\n",
       "      <th>edcl</th>\n",
       "      <th>married</th>\n",
       "      <th>kids</th>\n",
       "      <th>...</th>\n",
       "      <th>LLOAN11</th>\n",
       "      <th>LLOAN12</th>\n",
       "      <th>nwcat</th>\n",
       "      <th>inccat</th>\n",
       "      <th>assetcat</th>\n",
       "      <th>ninccat</th>\n",
       "      <th>NINC2CAT</th>\n",
       "      <th>nwpctlecat</th>\n",
       "      <th>incpctlecat</th>\n",
       "      <th>nincpctlecat</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>11</td>\n",
       "      <td>6427.136676</td>\n",
       "      <td>2</td>\n",
       "      <td>71</td>\n",
       "      <td>5</td>\n",
       "      <td>10</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>12</td>\n",
       "      <td>6428.350592</td>\n",
       "      <td>2</td>\n",
       "      <td>71</td>\n",
       "      <td>5</td>\n",
       "      <td>10</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>6414.477294</td>\n",
       "      <td>2</td>\n",
       "      <td>71</td>\n",
       "      <td>5</td>\n",
       "      <td>10</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>14</td>\n",
       "      <td>6428.487972</td>\n",
       "      <td>2</td>\n",
       "      <td>71</td>\n",
       "      <td>5</td>\n",
       "      <td>10</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>15</td>\n",
       "      <td>6425.256822</td>\n",
       "      <td>2</td>\n",
       "      <td>71</td>\n",
       "      <td>5</td>\n",
       "      <td>10</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 348 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   YY1  Y1          wgt  hhsex  age  agecl  educ  edcl  married  kids  \\\n",
       "0    1  11  6427.136676      2   71      5    10     3        2     0   \n",
       "1    1  12  6428.350592      2   71      5    10     3        2     0   \n",
       "2    1  13  6414.477294      2   71      5    10     3        2     0   \n",
       "3    1  14  6428.487972      2   71      5    10     3        2     0   \n",
       "4    1  15  6425.256822      2   71      5    10     3        2     0   \n",
       "\n",
       "       ...       LLOAN11  LLOAN12  nwcat  inccat  assetcat  ninccat  NINC2CAT  \\\n",
       "0      ...             0        0      3       1         3        1         1   \n",
       "1      ...             0        0      3       1         3        1         1   \n",
       "2      ...             0        0      3       1         3        1         1   \n",
       "3      ...             0        0      3       1         3        1         1   \n",
       "4      ...             0        0      3       1         3        1         1   \n",
       "\n",
       "   nwpctlecat  incpctlecat  nincpctlecat  \n",
       "0           7            1             1  \n",
       "1           7            1             1  \n",
       "2           7            1             1  \n",
       "3           7            1             1  \n",
       "4           7            1             1  \n",
       "\n",
       "[5 rows x 348 columns]"
      ]
     },
     "execution_count": 99,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df17.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read tabular data from PDF file using the `Tabula-py` package\n",
    "\n",
    "For more information about `tabula-py`, please see this link: https://github.com/chezou/tabula-py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tabula import read_pdf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "df18_1 = read_pdf('Housing_data.pdf',pages=[1],pandas_options={'header':None})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.17004</td>\n",
       "      <td>12.5</td>\n",
       "      <td>7.87</td>\n",
       "      <td>0</td>\n",
       "      <td>0.524</td>\n",
       "      <td>6.004</td>\n",
       "      <td>85.9</td>\n",
       "      <td>6.5921</td>\n",
       "      <td>5</td>\n",
       "      <td>311</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.22489</td>\n",
       "      <td>12.5</td>\n",
       "      <td>7.87</td>\n",
       "      <td>0</td>\n",
       "      <td>0.524</td>\n",
       "      <td>6.377</td>\n",
       "      <td>94.3</td>\n",
       "      <td>6.3467</td>\n",
       "      <td>5</td>\n",
       "      <td>311</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.11747</td>\n",
       "      <td>12.5</td>\n",
       "      <td>7.87</td>\n",
       "      <td>0</td>\n",
       "      <td>0.524</td>\n",
       "      <td>6.009</td>\n",
       "      <td>82.9</td>\n",
       "      <td>6.2267</td>\n",
       "      <td>5</td>\n",
       "      <td>311</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.09378</td>\n",
       "      <td>12.5</td>\n",
       "      <td>7.87</td>\n",
       "      <td>0</td>\n",
       "      <td>0.524</td>\n",
       "      <td>5.889</td>\n",
       "      <td>39.0</td>\n",
       "      <td>5.4509</td>\n",
       "      <td>5</td>\n",
       "      <td>311</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         0     1     2  3      4      5     6       7  8    9\n",
       "0  0.17004  12.5  7.87  0  0.524  6.004  85.9  6.5921  5  311\n",
       "1  0.22489  12.5  7.87  0  0.524  6.377  94.3  6.3467  5  311\n",
       "2  0.11747  12.5  7.87  0  0.524  6.009  82.9  6.2267  5  311\n",
       "3  0.09378  12.5  7.87  0  0.524  5.889  39.0  5.4509  5  311"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df18_1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "df18_2 = read_pdf('Housing_data.pdf',pages=[2],pandas_options={'header':None})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>15.2</td>\n",
       "      <td>386.71</td>\n",
       "      <td>17.10</td>\n",
       "      <td>18.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>15.2</td>\n",
       "      <td>392.52</td>\n",
       "      <td>20.45</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>15.2</td>\n",
       "      <td>396.90</td>\n",
       "      <td>13.27</td>\n",
       "      <td>18.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>15.2</td>\n",
       "      <td>390.50</td>\n",
       "      <td>15.71</td>\n",
       "      <td>21.7</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      0       1      2     3\n",
       "0  15.2  386.71  17.10  18.9\n",
       "1  15.2  392.52  20.45  15.0\n",
       "2  15.2  396.90  13.27  18.9\n",
       "3  15.2  390.50  15.71  21.7"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df18_2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "df18=pd.concat([df18_1,df18_2],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.17004</td>\n",
       "      <td>12.5</td>\n",
       "      <td>7.87</td>\n",
       "      <td>0</td>\n",
       "      <td>0.524</td>\n",
       "      <td>6.004</td>\n",
       "      <td>85.9</td>\n",
       "      <td>6.5921</td>\n",
       "      <td>5</td>\n",
       "      <td>311</td>\n",
       "      <td>15.2</td>\n",
       "      <td>386.71</td>\n",
       "      <td>17.10</td>\n",
       "      <td>18.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.22489</td>\n",
       "      <td>12.5</td>\n",
       "      <td>7.87</td>\n",
       "      <td>0</td>\n",
       "      <td>0.524</td>\n",
       "      <td>6.377</td>\n",
       "      <td>94.3</td>\n",
       "      <td>6.3467</td>\n",
       "      <td>5</td>\n",
       "      <td>311</td>\n",
       "      <td>15.2</td>\n",
       "      <td>392.52</td>\n",
       "      <td>20.45</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.11747</td>\n",
       "      <td>12.5</td>\n",
       "      <td>7.87</td>\n",
       "      <td>0</td>\n",
       "      <td>0.524</td>\n",
       "      <td>6.009</td>\n",
       "      <td>82.9</td>\n",
       "      <td>6.2267</td>\n",
       "      <td>5</td>\n",
       "      <td>311</td>\n",
       "      <td>15.2</td>\n",
       "      <td>396.90</td>\n",
       "      <td>13.27</td>\n",
       "      <td>18.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.09378</td>\n",
       "      <td>12.5</td>\n",
       "      <td>7.87</td>\n",
       "      <td>0</td>\n",
       "      <td>0.524</td>\n",
       "      <td>5.889</td>\n",
       "      <td>39.0</td>\n",
       "      <td>5.4509</td>\n",
       "      <td>5</td>\n",
       "      <td>311</td>\n",
       "      <td>15.2</td>\n",
       "      <td>390.50</td>\n",
       "      <td>15.71</td>\n",
       "      <td>21.7</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         0     1     2  3      4      5     6       7  8    9     0       1  \\\n",
       "0  0.17004  12.5  7.87  0  0.524  6.004  85.9  6.5921  5  311  15.2  386.71   \n",
       "1  0.22489  12.5  7.87  0  0.524  6.377  94.3  6.3467  5  311  15.2  392.52   \n",
       "2  0.11747  12.5  7.87  0  0.524  6.009  82.9  6.2267  5  311  15.2  396.90   \n",
       "3  0.09378  12.5  7.87  0  0.524  5.889  39.0  5.4509  5  311  15.2  390.50   \n",
       "\n",
       "       2     3  \n",
       "0  17.10  18.9  \n",
       "1  20.45  15.0  \n",
       "2  13.27  18.9  \n",
       "3  15.71  21.7  "
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df18"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### With PDF extraction, most of the time, headres will be difficult to extract automatically. You have to pass on the list of headres as the `names` argument in the `read-pdf` function as `pandas_option`,"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "names=['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT','PRICE']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "df18_1 = read_pdf('Housing_data.pdf',pages=[1],pandas_options={'header':None,'names':names[:10]})\n",
    "df18_2 = read_pdf('Housing_data.pdf',pages=[2],pandas_options={'header':None,'names':names[10:]})\n",
    "df18=pd.concat([df18_1,df18_2],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>CRIM</th>\n",
       "      <th>ZN</th>\n",
       "      <th>INDUS</th>\n",
       "      <th>CHAS</th>\n",
       "      <th>NOX</th>\n",
       "      <th>RM</th>\n",
       "      <th>AGE</th>\n",
       "      <th>DIS</th>\n",
       "      <th>RAD</th>\n",
       "      <th>TAX</th>\n",
       "      <th>PTRATIO</th>\n",
       "      <th>B</th>\n",
       "      <th>LSTAT</th>\n",
       "      <th>PRICE</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.17004</td>\n",
       "      <td>12.5</td>\n",
       "      <td>7.87</td>\n",
       "      <td>0</td>\n",
       "      <td>0.524</td>\n",
       "      <td>6.004</td>\n",
       "      <td>85.9</td>\n",
       "      <td>6.5921</td>\n",
       "      <td>5</td>\n",
       "      <td>311</td>\n",
       "      <td>15.2</td>\n",
       "      <td>386.71</td>\n",
       "      <td>17.10</td>\n",
       "      <td>18.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.22489</td>\n",
       "      <td>12.5</td>\n",
       "      <td>7.87</td>\n",
       "      <td>0</td>\n",
       "      <td>0.524</td>\n",
       "      <td>6.377</td>\n",
       "      <td>94.3</td>\n",
       "      <td>6.3467</td>\n",
       "      <td>5</td>\n",
       "      <td>311</td>\n",
       "      <td>15.2</td>\n",
       "      <td>392.52</td>\n",
       "      <td>20.45</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.11747</td>\n",
       "      <td>12.5</td>\n",
       "      <td>7.87</td>\n",
       "      <td>0</td>\n",
       "      <td>0.524</td>\n",
       "      <td>6.009</td>\n",
       "      <td>82.9</td>\n",
       "      <td>6.2267</td>\n",
       "      <td>5</td>\n",
       "      <td>311</td>\n",
       "      <td>15.2</td>\n",
       "      <td>396.90</td>\n",
       "      <td>13.27</td>\n",
       "      <td>18.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.09378</td>\n",
       "      <td>12.5</td>\n",
       "      <td>7.87</td>\n",
       "      <td>0</td>\n",
       "      <td>0.524</td>\n",
       "      <td>5.889</td>\n",
       "      <td>39.0</td>\n",
       "      <td>5.4509</td>\n",
       "      <td>5</td>\n",
       "      <td>311</td>\n",
       "      <td>15.2</td>\n",
       "      <td>390.50</td>\n",
       "      <td>15.71</td>\n",
       "      <td>21.7</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD  TAX  PTRATIO  \\\n",
       "0  0.17004  12.5   7.87     0  0.524  6.004  85.9  6.5921    5  311     15.2   \n",
       "1  0.22489  12.5   7.87     0  0.524  6.377  94.3  6.3467    5  311     15.2   \n",
       "2  0.11747  12.5   7.87     0  0.524  6.009  82.9  6.2267    5  311     15.2   \n",
       "3  0.09378  12.5   7.87     0  0.524  5.889  39.0  5.4509    5  311     15.2   \n",
       "\n",
       "        B  LSTAT  PRICE  \n",
       "0  386.71  17.10   18.9  \n",
       "1  392.52  20.45   15.0  \n",
       "2  396.90  13.27   18.9  \n",
       "3  390.50  15.71   21.7  "
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df18"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Exercise 19: In a complex page, you may have multiple tables and use Tabula to extract a list of DataFrames and then process the DataFrames further"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "list_of_tables=read_pdf(\"WDI-2016.pdf\",pages=70,multiple_tables=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "list"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(list_of_tables)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>10</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Italy</td>\n",
       "      <td>60.8</td>\n",
       "      <td>301.3</td>\n",
       "      <td>207</td>\n",
       "      <td>69</td>\n",
       "      <td>2,102.2</td>\n",
       "      <td>34,580</td>\n",
       "      <td>2,155.2</td>\n",
       "      <td>35,450</td>\n",
       "      <td>–0.4</td>\n",
       "      <td>–1.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Jamaica</td>\n",
       "      <td>2.7</td>\n",
       "      <td>11.0</td>\n",
       "      <td>251</td>\n",
       "      <td>55</td>\n",
       "      <td>14.0</td>\n",
       "      <td>5,150</td>\n",
       "      <td>23.5</td>\n",
       "      <td>8,640</td>\n",
       "      <td>0.7</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Japan</td>\n",
       "      <td>127.1</td>\n",
       "      <td>378.0</td>\n",
       "      <td>349</td>\n",
       "      <td>93</td>\n",
       "      <td>5,339.1</td>\n",
       "      <td>42,000</td>\n",
       "      <td>4,846.7</td>\n",
       "      <td>38,120</td>\n",
       "      <td>–0.1</td>\n",
       "      <td>0.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Jordan</td>\n",
       "      <td>6.6</td>\n",
       "      <td>89.3</td>\n",
       "      <td>74</td>\n",
       "      <td>83</td>\n",
       "      <td>34.1</td>\n",
       "      <td>5,160</td>\n",
       "      <td>78.7</td>\n",
       "      <td>11,910</td>\n",
       "      <td>3.1</td>\n",
       "      <td>0.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Kazakhstan</td>\n",
       "      <td>17.3</td>\n",
       "      <td>2,724.9</td>\n",
       "      <td>6</td>\n",
       "      <td>53</td>\n",
       "      <td>204.8</td>\n",
       "      <td>11,850</td>\n",
       "      <td>375.3</td>\n",
       "      <td>21,710</td>\n",
       "      <td>4.4</td>\n",
       "      <td>2.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Kenya</td>\n",
       "      <td>44.9</td>\n",
       "      <td>580.4</td>\n",
       "      <td>79</td>\n",
       "      <td>25</td>\n",
       "      <td>58.1</td>\n",
       "      <td>1,290</td>\n",
       "      <td>131.8</td>\n",
       "      <td>2,940</td>\n",
       "      <td>5.3</td>\n",
       "      <td>2.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Kiribati</td>\n",
       "      <td>0.1</td>\n",
       "      <td>0.8</td>\n",
       "      <td>136</td>\n",
       "      <td>44</td>\n",
       "      <td>0.3</td>\n",
       "      <td>2,950</td>\n",
       "      <td>0.4a</td>\n",
       "      <td>3,340a</td>\n",
       "      <td>3.7</td>\n",
       "      <td>1.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Korea, Dem. People’s Rep.</td>\n",
       "      <td>25.0</td>\n",
       "      <td>120.5</td>\n",
       "      <td>208</td>\n",
       "      <td>61</td>\n",
       "      <td>..</td>\n",
       "      <td>..j</td>\n",
       "      <td>..</td>\n",
       "      <td>..</td>\n",
       "      <td>..</td>\n",
       "      <td>..</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Korea, Rep.</td>\n",
       "      <td>50.4</td>\n",
       "      <td>100.3</td>\n",
       "      <td>517</td>\n",
       "      <td>82</td>\n",
       "      <td>1,365.8</td>\n",
       "      <td>27,090</td>\n",
       "      <td>1,697.0</td>\n",
       "      <td>33,650</td>\n",
       "      <td>3.3</td>\n",
       "      <td>2.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>Kosovo</td>\n",
       "      <td>1.8</td>\n",
       "      <td>10.9</td>\n",
       "      <td>167</td>\n",
       "      <td>..</td>\n",
       "      <td>7.3</td>\n",
       "      <td>3,990</td>\n",
       "      <td>17.0a</td>\n",
       "      <td>9,300a</td>\n",
       "      <td>1.2</td>\n",
       "      <td>0.9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                          0      1        2    3   4        5       6   \\\n",
       "0                      Italy   60.8    301.3  207  69  2,102.2  34,580   \n",
       "1                    Jamaica    2.7     11.0  251  55     14.0   5,150   \n",
       "2                      Japan  127.1    378.0  349  93  5,339.1  42,000   \n",
       "3                     Jordan    6.6     89.3   74  83     34.1   5,160   \n",
       "4                 Kazakhstan   17.3  2,724.9    6  53    204.8  11,850   \n",
       "5                      Kenya   44.9    580.4   79  25     58.1   1,290   \n",
       "6                   Kiribati    0.1      0.8  136  44      0.3   2,950   \n",
       "7  Korea, Dem. People’s Rep.   25.0    120.5  208  61       ..     ..j   \n",
       "8                Korea, Rep.   50.4    100.3  517  82  1,365.8  27,090   \n",
       "9                     Kosovo    1.8     10.9  167  ..      7.3   3,990   \n",
       "\n",
       "        7       8     9     10  \n",
       "0  2,155.2  35,450  –0.4  –1.4  \n",
       "1     23.5   8,640   0.7   0.5  \n",
       "2  4,846.7  38,120  –0.1   0.1  \n",
       "3     78.7  11,910   3.1   0.8  \n",
       "4    375.3  21,710   4.4   2.9  \n",
       "5    131.8   2,940   5.3   2.6  \n",
       "6     0.4a  3,340a   3.7   1.9  \n",
       "7       ..      ..    ..    ..  \n",
       "8  1,697.0  33,650   3.3   2.9  \n",
       "9    17.0a  9,300a   1.2   0.9  "
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list_of_tables[1].head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Country</th>\n",
       "      <th>Population</th>\n",
       "      <th>Surface area</th>\n",
       "      <th>Population density</th>\n",
       "      <th>Urban pop %</th>\n",
       "      <th>GNI Atlas Method (Billions)</th>\n",
       "      <th>GNI Atlas Method (Per capita)</th>\n",
       "      <th>Purchasing power (Billions)</th>\n",
       "      <th>Purchasing power (Per capita)</th>\n",
       "      <th>GDP % growth</th>\n",
       "      <th>GDP per capita growth</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Italy</td>\n",
       "      <td>60.8</td>\n",
       "      <td>301.3</td>\n",
       "      <td>207</td>\n",
       "      <td>69</td>\n",
       "      <td>2,102.2</td>\n",
       "      <td>34,580</td>\n",
       "      <td>2,155.2</td>\n",
       "      <td>35,450</td>\n",
       "      <td>–0.4</td>\n",
       "      <td>–1.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Jamaica</td>\n",
       "      <td>2.7</td>\n",
       "      <td>11.0</td>\n",
       "      <td>251</td>\n",
       "      <td>55</td>\n",
       "      <td>14.0</td>\n",
       "      <td>5,150</td>\n",
       "      <td>23.5</td>\n",
       "      <td>8,640</td>\n",
       "      <td>0.7</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Japan</td>\n",
       "      <td>127.1</td>\n",
       "      <td>378.0</td>\n",
       "      <td>349</td>\n",
       "      <td>93</td>\n",
       "      <td>5,339.1</td>\n",
       "      <td>42,000</td>\n",
       "      <td>4,846.7</td>\n",
       "      <td>38,120</td>\n",
       "      <td>–0.1</td>\n",
       "      <td>0.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Jordan</td>\n",
       "      <td>6.6</td>\n",
       "      <td>89.3</td>\n",
       "      <td>74</td>\n",
       "      <td>83</td>\n",
       "      <td>34.1</td>\n",
       "      <td>5,160</td>\n",
       "      <td>78.7</td>\n",
       "      <td>11,910</td>\n",
       "      <td>3.1</td>\n",
       "      <td>0.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Kazakhstan</td>\n",
       "      <td>17.3</td>\n",
       "      <td>2,724.9</td>\n",
       "      <td>6</td>\n",
       "      <td>53</td>\n",
       "      <td>204.8</td>\n",
       "      <td>11,850</td>\n",
       "      <td>375.3</td>\n",
       "      <td>21,710</td>\n",
       "      <td>4.4</td>\n",
       "      <td>2.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Kenya</td>\n",
       "      <td>44.9</td>\n",
       "      <td>580.4</td>\n",
       "      <td>79</td>\n",
       "      <td>25</td>\n",
       "      <td>58.1</td>\n",
       "      <td>1,290</td>\n",
       "      <td>131.8</td>\n",
       "      <td>2,940</td>\n",
       "      <td>5.3</td>\n",
       "      <td>2.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Kiribati</td>\n",
       "      <td>0.1</td>\n",
       "      <td>0.8</td>\n",
       "      <td>136</td>\n",
       "      <td>44</td>\n",
       "      <td>0.3</td>\n",
       "      <td>2,950</td>\n",
       "      <td>0.4a</td>\n",
       "      <td>3,340a</td>\n",
       "      <td>3.7</td>\n",
       "      <td>1.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Korea, Dem. People’s Rep.</td>\n",
       "      <td>25.0</td>\n",
       "      <td>120.5</td>\n",
       "      <td>208</td>\n",
       "      <td>61</td>\n",
       "      <td>..</td>\n",
       "      <td>..j</td>\n",
       "      <td>..</td>\n",
       "      <td>..</td>\n",
       "      <td>..</td>\n",
       "      <td>..</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Korea, Rep.</td>\n",
       "      <td>50.4</td>\n",
       "      <td>100.3</td>\n",
       "      <td>517</td>\n",
       "      <td>82</td>\n",
       "      <td>1,365.8</td>\n",
       "      <td>27,090</td>\n",
       "      <td>1,697.0</td>\n",
       "      <td>33,650</td>\n",
       "      <td>3.3</td>\n",
       "      <td>2.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>Kosovo</td>\n",
       "      <td>1.8</td>\n",
       "      <td>10.9</td>\n",
       "      <td>167</td>\n",
       "      <td>..</td>\n",
       "      <td>7.3</td>\n",
       "      <td>3,990</td>\n",
       "      <td>17.0a</td>\n",
       "      <td>9,300a</td>\n",
       "      <td>1.2</td>\n",
       "      <td>0.9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     Country Population Surface area Population density  \\\n",
       "0                      Italy       60.8        301.3                207   \n",
       "1                    Jamaica        2.7         11.0                251   \n",
       "2                      Japan      127.1        378.0                349   \n",
       "3                     Jordan        6.6         89.3                 74   \n",
       "4                 Kazakhstan       17.3      2,724.9                  6   \n",
       "5                      Kenya       44.9        580.4                 79   \n",
       "6                   Kiribati        0.1          0.8                136   \n",
       "7  Korea, Dem. People’s Rep.       25.0        120.5                208   \n",
       "8                Korea, Rep.       50.4        100.3                517   \n",
       "9                     Kosovo        1.8         10.9                167   \n",
       "\n",
       "  Urban pop % GNI Atlas Method (Billions) GNI Atlas Method (Per capita)  \\\n",
       "0          69                     2,102.2                        34,580   \n",
       "1          55                        14.0                         5,150   \n",
       "2          93                     5,339.1                        42,000   \n",
       "3          83                        34.1                         5,160   \n",
       "4          53                       204.8                        11,850   \n",
       "5          25                        58.1                         1,290   \n",
       "6          44                         0.3                         2,950   \n",
       "7          61                          ..                           ..j   \n",
       "8          82                     1,365.8                        27,090   \n",
       "9          ..                         7.3                         3,990   \n",
       "\n",
       "  Purchasing power (Billions) Purchasing power (Per capita) GDP % growth  \\\n",
       "0                     2,155.2                        35,450         –0.4   \n",
       "1                        23.5                         8,640          0.7   \n",
       "2                     4,846.7                        38,120         –0.1   \n",
       "3                        78.7                        11,910          3.1   \n",
       "4                       375.3                        21,710          4.4   \n",
       "5                       131.8                         2,940          5.3   \n",
       "6                        0.4a                        3,340a          3.7   \n",
       "7                          ..                            ..           ..   \n",
       "8                     1,697.0                        33,650          3.3   \n",
       "9                       17.0a                        9,300a          1.2   \n",
       "\n",
       "  GDP per capita growth  \n",
       "0                  –1.4  \n",
       "1                   0.5  \n",
       "2                   0.1  \n",
       "3                   0.8  \n",
       "4                   2.9  \n",
       "5                   2.6  \n",
       "6                   1.9  \n",
       "7                    ..  \n",
       "8                   2.9  \n",
       "9                   0.9  "
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df19 = list_of_tables[1]\n",
    "df19.columns = ['Country','Population','Surface area','Population density','Urban pop %','GNI Atlas Method (Billions)',\n",
    "                'GNI Atlas Method (Per capita)','Purchasing power (Billions)','Purchasing power (Per capita)',\n",
    "               'GDP % growth', 'GDP per capita growth']\n",
    "df19.head(10)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  },
  "latex_envs": {
   "LaTeX_envs_menu_present": true,
   "autoclose": false,
   "autocomplete": true,
   "bibliofile": "biblio.bib",
   "cite_by": "apalike",
   "current_citInitial": 1,
   "eqLabelWithNumbers": true,
   "eqNumInitial": 1,
   "hotkeys": {
    "equation": "Ctrl-E",
    "itemize": "Ctrl-I"
   },
   "labels_anchors": false,
   "latex_user_defs": false,
   "report_style_numbering": false,
   "user_envs_cfg": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
